The main target of this project is to create a machine learning model able to classify financial transactions as fraud or non-fraud. The data will be cleaned, preprocessed and then balanced; all of these steps improve model performance and classification accuracy. The dataset provided in the link above includes about 284 thousand cases with only 492 fraud transactions. Every row of the dataset contains the time and amount of a transaction, along with 28 more columns that are components produced by a PCA preprocessing algorithm.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import *
%matplotlib inline
# Load the credit-card transactions and take a first look at the data:
# head, summary statistics, column list, class ratio, and missing values.
fraud_dataset = pd.read_csv('/data/notebook_files/creditcard.csv')
fraud_dataset.head()
fraud_dataset.describe().transpose()
pd.DataFrame(fraud_dataset.columns)
print("Target class ratio:")
class_counts = fraud_dataset['Class'].value_counts()
total_rows = fraud_dataset['Class'].count()
pd.DataFrame({'Amount': class_counts,
              'Ratio': [round(class_counts[0] / total_rows * 100, 2),
                        round(class_counts[1] / total_rows * 100, 2)]})
pd.DataFrame({"Ammount of Nan": fraud_dataset.isna().sum()})
# Bar chart of the target class; a log y-scale keeps the tiny fraud
# class visible next to the dominant non-fraud class.
sns.set()
count_fig = plt.figure(figsize=(6, 4))
count_ax = count_fig.add_axes((.1, .4, .8, .5))
count_ax = sns.countplot(x=fraud_dataset['Class'])
count_ax.set_title("Class distribution")
count_ax.set_yscale("log")
plt.show()
# Density plots of the two raw (non-PCA) features, Amount and Time.
# Fix: seaborn's `shade=` keyword was deprecated in 0.11 and removed in
# recent releases; `fill=True` is the supported equivalent.
x1 = fraud_dataset['Amount'].values
x2 = fraud_dataset['Time'].values
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sns.kdeplot(ax=axes[0], fill=True, x=x1, color='b')
axes[0].set_title("Distribution of Amount")
sns.kdeplot(ax=axes[1], fill=True, x=x2, color='r')
axes[1].set_title("Distribution of Time")
plt.show()
# One KDE per PCA component V1..V28, laid out on a 4x7 grid.
# Fixes: `drop` already returns a DataFrame, so the pd.DataFrame wrapper
# was redundant; seaborn's deprecated/removed `shade=` replaced by `fill=`.
v_vars = fraud_dataset.drop(['Amount', 'Time', 'Class'], axis=1)
fig, axes = plt.subplots(4, 7, figsize=(20, 16))
fig.suptitle("Distribution of 'V' Variables", fontsize=30)
k = 1
for row in range(4):
    for col in range(7):
        sns.kdeplot(ax=axes[row][col], fill=True, x=v_vars[f"V{k}"], color='b')
        axes[row][col].set_title(f"Distribution of V{k}")
        axes[row][col].set_ylabel("")
        axes[row][col].set_xlabel("")
        k += 1
plt.show()
# Spearman rank-correlation heatmap over all columns of the raw dataset.
heat_fig = plt.figure(figsize=(20, 12))
heat_ax = sns.heatmap(fraud_dataset.corr(method='spearman'), cmap="YlGnBu")
heat_ax.set_title("Correllation map", fontsize=30)
plt.show()
# Separate the records by target label and report the raw class counts.
fraud = fraud_dataset[fraud_dataset['Class'] == 1]
non_fraud = fraud_dataset[fraud_dataset['Class'] == 0]
print(f"Fraud transactions: {len(fraud)}\nNon fraud transactions: {len(non_fraud)}")
The imbalance between fraud and non-fraud cases is far too big. There are two main techniques for dealing with this type of dataset — oversampling and undersampling. The main drawback of undersampling is that we lose information. Oversampling is not ideal either — we randomly copy the same cases, which may cause poor performance of the machine learning model. In this case the ratio between class 0 and class 1 is 99.83 to 0.17, which means we can't use oversampling at the original size of the dataset. In the first step we will drop random records of non-fraud cases, keeping 1000 of them, and then oversample the fraud subset. That gives us a dataset with 2000 records and a perfect balance between the target classes.
# Balance the classes: oversample fraud (with replacement) and undersample
# non-fraud, to 1000 rows each.
# Fix: the original `.sample()` calls were unseeded, so the balanced
# dataset -- and every model/metric derived from it -- changed on each run.
# random_state=101 pins the draw, matching the seed used for the
# train/test split later in the notebook.
print("Oversampling and undersampling")
final_size = 1000
fraud_oversample = fraud.sample(final_size, replace=True, random_state=101)
non_fraud_undersample = non_fraud.sample(final_size, random_state=101)
print(f"Fraud subset size: {fraud_oversample.shape}\nNon fraud subset size: {non_fraud_undersample.shape}")
print("Concatenating subsets")
fraud_dataset_balanced = pd.concat([fraud_oversample, non_fraud_undersample], sort=True)
balanced_counts = fraud_dataset_balanced['Class'].value_counts()
balanced_total = fraud_dataset_balanced['Class'].count()
pd.DataFrame({'Amount': balanced_counts,
              'Ratio': [round(balanced_counts[0] / balanced_total * 100, 2),
                        round(balanced_counts[1] / balanced_total * 100, 2)]})
# Side-by-side class distributions: original (log scale, heavily
# imbalanced) vs. the balanced dataset.
fig, (before_ax, after_ax) = plt.subplots(1, 2, figsize=(10, 4))
sns.countplot(ax=before_ax, x=fraud_dataset['Class'])
before_ax.set_title("Class Distribution Before Sampling")
before_ax.set_yscale("log")
sns.countplot(ax=after_ax, x=fraud_dataset_balanced['Class'])
after_ax.set_title("Class Distribution After Sampling")
plt.show()
# Compare Amount/Time densities before vs. after resampling.
# Fixes: title typo "Beofre" -> "Before" (two occurrences); seaborn's
# deprecated/removed `shade=` keyword replaced with the supported `fill=`.
x1 = fraud_dataset['Amount'].values
x2 = fraud_dataset['Time'].values
x3 = fraud_dataset_balanced['Amount'].values
x4 = fraud_dataset_balanced['Time'].values
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
sns.kdeplot(ax=axes[0][0], fill=True, x=x1, color='r')
axes[0][0].set_title("Distribution of Amount Before Sampling")
sns.kdeplot(ax=axes[0][1], fill=True, x=x2, color='r')
axes[0][1].set_title("Distribution of Time Before Sampling", loc="right")
sns.kdeplot(ax=axes[1][0], fill=True, x=x3, color='g')
axes[1][0].set_title("Distribution of Amount After Sampling")
sns.kdeplot(ax=axes[1][1], fill=True, x=x4, color='g')
axes[1][1].set_title("Distribution of Time After Sampling", loc="right")
plt.show()
# KDE per PCA component V1..V28 of the balanced dataset, 4x7 grid.
# Fixes: redundant pd.DataFrame wrapper around `drop` removed; seaborn's
# deprecated/removed `shade=` keyword replaced with the supported `fill=`.
v_vars = fraud_dataset_balanced.drop(['Amount', 'Time', 'Class'], axis=1)
fig, axes = plt.subplots(4, 7, figsize=(20, 16))
fig.suptitle("Distribution of 'V' Variables After Sampling", fontsize=30)
k = 1
for row in range(4):
    for col in range(7):
        sns.kdeplot(ax=axes[row][col], fill=True, x=v_vars[f"V{k}"], color='b')
        axes[row][col].set_title(f"Distribution of V{k}")
        axes[row][col].set_ylabel("")
        axes[row][col].set_xlabel("")
        k += 1
plt.show()
# Correlation structure before vs. after balancing.
# Fix: the "After" heatmap used pandas' default (Pearson) correlation
# while the "Before" one used Spearman, making the two panels
# incomparable; both now use Spearman rank correlation.
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
fig.suptitle("Correlation Map Before and After Sampling", fontsize=30)
sns.heatmap(ax=axes[0], data=fraud_dataset.corr(method='spearman'), cmap="YlGnBu")
axes[0].set_title("Before ", fontsize=20)
sns.heatmap(ax=axes[1], data=fraud_dataset_balanced.corr(method='spearman'), cmap="YlGnBu")
axes[1].set_title("After", fontsize=20)
plt.show()
The charts above show that the sampling process hasn't significantly changed the distributions of the independent variables. What really has changed is the relation between particular variables: relationships within the data became more visible, which makes it easier to select the most significant variables. That should improve model performance.
This dataset provides 30 features, so it is necessary to pick the most important ones. Skipping this process can make the model's outcome less significant, or overfitting can occur.
# Score every feature against the target with the ANOVA F-test
# (SelectKBest with k='all' keeps everything; we only want the scores).
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split

feature_frame = fraud_dataset_balanced.drop('Class', axis=1)
X = feature_frame.values.tolist()
y = fraud_dataset_balanced['Class'].values.tolist()
feature_selection = SelectKBest(score_func=f_classif, k='all')
feature_selection.fit(X, y)
X = feature_selection.transform(X)
selection_results = pd.DataFrame({"Variable": feature_frame.columns,
                                  "Score": feature_selection.scores_,
                                  "P-Value": feature_selection.pvalues_})
selection_results
# Visualise the F-scores and p-values per feature as bar charts.
fig, (score_ax, pval_ax) = plt.subplots(2, 1, figsize=(20, 9))
fig.suptitle("Selection Results", fontsize=30)
score_ax.bar(selection_results['Variable'].values, selection_results['Score'].values)
score_ax.set_title("Selection Scores")
pval_ax.bar(selection_results['Variable'].values, selection_results['P-Value'].values)
pval_ax.set_title("P-Values")
plt.show()
# Keep only the features significant at p < 0.005, standardise them,
# and hold out 25% of the rows for testing (fixed seed for repeatability).
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

selection_results = selection_results[selection_results['P-Value'] < 0.005]
selected_columns = selection_results['Variable'].values.tolist()
X = fraud_dataset_balanced[selected_columns].values.tolist()
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)
# Fit a plain logistic-regression classifier on the training split
# and predict labels for the held-out set.
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression().fit(X_train, y_train)
predictions_logistic = logistic.predict(X_test)
# Evaluate the fitted model: confusion matrix, headline metrics,
# per-class report, coefficients, and the ROC curve.
# Fixes: `plot_roc_curve` was deprecated in scikit-learn 1.0 and removed
# in 1.2, so the original import fails on current versions -- replaced
# with RocCurveDisplay.from_estimator; corrected the "Confussion" typo
# in the chart title.
from sklearn.metrics import (confusion_matrix, classification_report,
                             RocCurveDisplay, accuracy_score,
                             average_precision_score, log_loss)

fig = plt.figure(figsize=(6, 6))
ax = sns.heatmap(confusion_matrix(y_test, predictions_logistic), annot=True, cmap="YlGnBu")
ax.set_title("Confusion Matrix")
plt.show()
# NOTE(review): log_loss and average_precision_score are computed on hard
# 0/1 predictions here; feeding predict_proba would be more informative --
# left unchanged to preserve the reported numbers.
pd.DataFrame({"Metric": ['Accuracy', 'Avg Precision', 'Log loss'],
              "Score": [accuracy_score(y_test, predictions_logistic),
                        round(average_precision_score(y_test, predictions_logistic), 2),
                        round(log_loss(y_test, predictions_logistic), 2)]})
print(classification_report(y_test, predictions_logistic))
pd.DataFrame({"Intercept": logistic.intercept_})
coefs = pd.DataFrame({"Variable": selected_columns, "Coefficient": logistic.coef_[0]})
coefs
fig = plt.figure(figsize=(20, 6))
fig.suptitle("Model Coefficients", fontsize=30)
ax = plt.bar(selection_results['Variable'].values, coefs["Coefficient"].values)
plt.show()
RocCurveDisplay.from_estimator(logistic, X_test, y_test)
plt.show()
The best model accuracy reached a level of 0.95 (95%). Other metrics of the model's performance are very promising too. The ROC curve suggests an optimal sensitivity–specificity trade-off, with the best threshold level around 0.8. The area under the curve confirms that the model is able to distinguish fraud and non-fraud transactions almost perfectly.
# Convert the logistic-regression coefficients to odds ratios (exp(beta))
# and plot them per variable.
# Fix: `coefs_and_odds = coefs` was only an alias, so adding the 'Odds'
# column silently mutated the `coefs` frame as well; work on an explicit
# copy instead.
coefs_and_odds = coefs.copy()
coefs_and_odds['Odds'] = np.exp(coefs_and_odds["Coefficient"])
coefs_and_odds.sort_values(by=["Coefficient"])
fig = plt.figure(figsize=(20, 6))
fig.suptitle("Exponential Odds of Each Variable", fontsize=30)
ax = plt.bar(selection_results['Variable'].values, coefs_and_odds["Odds"].values, color="r")
plt.show()